from __future__ import division, absolute_import, print_function
%load_ext autoreload
%autoreload 2
import os
print(os.getcwd())
#import hepran
#import hepran.bzipscore as bz
#import hepran.bcipa as bc
#import hepran.utils as u
#import hepran.registers as r
#import hepran.agadir as ag
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib as mpl
import matplotlib.pylab as plt
import sklearn as sk
# Filesystem locations. Each can be overridden through the environment variable
# of the same name; the fallbacks are hard-coded local Windows paths.
# CCO is not referenced again in the visible part of this file.
CCO = os.environ.get("CCO",r"C:\Projekti_KI\ortoCC\CoiledCoilOrtho")
# SFD is the directory the set file named by `set_name` is loaded from.
SFD = os.environ.get("SFD",r"D:\data\ortoCC\design-with-alignments\4heptade-7\!OUT_bcf")
from score_utils import *
from interactive_set_plot import *
import bokeh
import bokeh.resources
import bokeh.plotting as bp
from bokeh.models import HoverTool
from IPython.core.display import display, HTML
import hepran
bp.output_notebook()
import sklearn
from sklearn import linear_model
from sklearn.externals import joblib
from IPython.display import display, HTML
%%time
# Load the fitting data; the first two CSV columns become the two-level row index.
df = pd.read_csv('data/DNA_round0_fiting.csv', index_col=[0,1])
# Row-wise pass through get_CC_features (defined outside this file;
# presumably adds the model feature columns -- TODO confirm).
df = df.apply(get_CC_features, axis=1)
#df.rename(columns={'RD_Tm':'Tm'}, inplace=True)
# Duplicate RD_Tm under the name 'Tm' (the fit target) while keeping RD_Tm itself.
df['Tm']=df.RD_Tm
# sd_RD scaled by its mean over all rows.
df['norm_sd_RD'] = df.sd_RD/df.sd_RD.mean()
# Ratio of sd_RD to mean_RD, per row.
df['cv_RD'] = df.sd_RD/df.mean_RD
# Accumulator tables that the per-iteration metrics / model features are
# appended to; both helpers come from outside this file.
fit_metrics_all = get_metrics_df().set_index('N_iter fit_type fit_class'.split())
model_features_all = get_features_df().set_index("N_iter fit_type".split())
def set_weights(name, df):
    """Fill ``df['weights']`` in place with the sample weights of scheme *name*.

    Recognised schemes:

    * ``W10L15H55`` / ``W10L15H50`` -- weight 1, raised to 10 for rows with
      ``Tm < 15``, with ``Tm`` above 55 / 50, or with ``on_target`` true.
    * ``W10L`` -- weight 1, raised to 10 for ``Tm < 15`` or on-target rows.
    * ``W10`` -- weight 1, raised to 10 for on-target rows.
    * ``WsdRD`` / ``WcvRD`` / ``WbnRD`` -- ``8 / norm_sd_RD``, ``8 / cv_RD``
      and ``bcnum`` respectively.
    * ``WsdRD10`` / ``WcvRD10`` / ``WbnRD10`` -- as the previous group, with
      the weight of on-target rows multiplied by 10.

    Any other name (for example ``W1`` or an empty string) leaves every row at
    weight 1.

    Parameters
    ----------
    name : str
        Name of the weighting scheme.
    df : pandas.DataFrame
        Modified in place. Only the columns the chosen scheme reads have to be
        present (``Tm``, ``on_target``, ``norm_sd_RD``, ``cv_RD``, ``bcnum``).

    Returns
    -------
    None
    """
    # Upper Tm cut-off for the two schemes that have one.
    tm_high = {"W10L15H55": 55, "W10L15H50": 50}

    # Every scheme starts from a uniform weight; unknown names stop here.
    df['weights'] = 1

    if name in tm_high or name == "W10L":
        df.loc[df.Tm < 15, 'weights'] = 10
        if name in tm_high:
            df.loc[df.Tm > tm_high[name], 'weights'] = 10
        # "== True" is an element-wise comparison on the column, so missing
        # values count as not on target.
        df.loc[df.on_target == True, 'weights'] = 10
    elif name == "W10":
        df.loc[df.on_target == True, 'weights'] = 10
    elif name in ("WsdRD", "WsdRD10"):
        df['weights'] = 8 / df.norm_sd_RD
    elif name in ("WcvRD", "WcvRD10"):
        df['weights'] = 8 / df.cv_RD
    elif name in ("WbnRD", "WbnRD10"):
        df['weights'] = df.bcnum

    # The "...10" variants of the data-driven schemes boost on-target rows.
    if name in ("WsdRD10", "WcvRD10", "WbnRD10"):
        on_target = df.on_target == True
        df.loc[on_target, 'weights'] = 10 * df.loc[on_target, 'weights']
# Extra column names; not referenced again in the visible part of this file
# (possibly consumed by the script run via %run -i further down -- TODO confirm).
extra_cols = ["RD_Tm", "ln_mean_RD", "bcnum", "sd_RD", "norm_sd_RD", "cv_RD"]

# Column the model is fitted against (no environment override for this one).
target_field = 'Tm'

# Run settings. Each one is taken from the environment variable of the same
# name when that is set, otherwise from the default given here.
fit_type_string = os.environ.get('fit_type_string', "basic-rep")
lm_type = os.environ.get('lm_type', "Ridge")  # Ridge, ElasticNet, SGDRegressor, BayesianRidge
weight_string = os.environ.get('weight_string', "WbnRD10")  # W1 W10 W10L15H50 W10L15H55
set_name = os.environ.get('set_name', 'ALL')

# Echo the effective settings so they are recorded in the run output.
print("fit_type_string =", fit_type_string)
print("lm_type =", lm_type)
print("weight_string =", weight_string)
print("set_name =", set_name)
## Restrict the data to one set, unless all rows are requested.
if set_name != "ALL":
    # `u` is not bound by any import visible in this file (the original
    # `import hepran.utils as u` is commented out), so bind it explicitly
    # instead of relying on a star import to provide it.
    import hepran.utils as u
    ids = u.get_ids_from_pairs(u.load_set_file(SFD+'/'+set_name))
    # Keep only rows whose ID1 and ID2 both belong to the set. The explicit
    # copy makes the result an independent frame: columns are written into it
    # in place later on (weights, score, pos).
    df = df.query('(ID1 in @ids) and (ID2 in @ids)').copy()
#df['weights'] = 1/df.cv_RD**2
#df.plot.scatter("Tm","score")
# (label, value) pairs handed to draw_scatter_interactive as `tooltips`.
# The '@name' strings are Bokeh hover-field references to data columns;
# 'IDs' is filled from the frame index before plotting. The *_disp columns are
# presumably produced by get_formated_seq -- TODO confirm.
tooltips = [
('ID1, ID2', '@IDs'),
('RD_Tm', '@RD_Tm'),
('score', '@score'),
('weights', '@weights'),
('ln_mean_RD', '@ln_mean_RD'),
('bcnum', '@bcnum'),
('sd_RD', '@sd_RD'),
('cv_RD', '@cv_RD'),
('seq1', '@seq1_disp{safe}'),
('seq2', '@seq2_disp{safe}'),
]
# ---- Iteration 0: fit on all selected rows, plot, record and export ----

# Feature columns for the chosen fit type (lookup table defined outside this file).
fit_fields = fit_fields_dic[fit_type_string]
# Run label: DNA-<set name without '.set'>-<fit type>[-<model>][-<weights>]
fit_type = 'DNA-' + set_name.replace('.set', '') + "-" + fit_type_string

# Writes the 'weights' column into df in place.
set_weights(weight_string, df)

Q = df
N_iter = 0
print("Iteration: ", N_iter)
print("Num points: ", len(Q))

lm, R2 = make_model(target_field, fit_fields, Q, lm_type)
fit_type += "-" + lm_type
if weight_string:
    fit_type += "-" + weight_string

# Model prediction for every row, plus the columns the plot needs.
df['score'] = lm.predict(df[fit_fields])
df['pos'] = 0
# Rebinds df to a new frame; Q keeps referring to the frame fitted above.
df = df.apply(get_formated_seq, axis=1)
df['IDs'] = df.index

title = str(N_iter) + "_" + fit_type
p = draw_scatter_interactive(target_field, 'score', df, y_range=(0, 80),
                             title=title, save_to_file=False, tooltips=tooltips)
bp.show(p)
#mpl_plot_fit(title, df)

# NOTE(review): DataFrame.append was removed in pandas 2.0. It is kept here
# because the return types of get_FIT_dataframe / get_model_features are not
# visible from this file; switch to pd.concat once they are confirmed to be
# DataFrames.
fit_metric = get_FIT_dataframe(Q, lm, N_iter, fit_type, N_feat=len(lm.coef_)+1, N_samples=len(Q))
fit_metrics_all = fit_metrics_all.append(fit_metric)
display(fit_metric)

model_features = get_model_features(lm, fit_fields, N_iter, fit_type)
model_features_all = model_features_all.append(model_features)
display(model_features)

# One shared prefix for all three outputs, built with os.path.join so the
# files land inside the 'models' directory on every platform (the plot used
# a hard-coded backslash before).
out_prefix = os.path.join('models', title)
joblib.dump(lm, out_prefix + '.model');
df.to_excel(out_prefix + '.score.xlsx');
bp.save(p, title=title, filename=out_prefix + '.plot.html', resources=bokeh.resources.INLINE);
# ---- Iterations 1 to 5 ----
# Each `%run -i` executes 08_aligned_fit_DO_ITER.py inside this session's
# namespace, so the script works on the variables set here (N_iter, seqs, df,
# ...). The script itself is not part of this file.
N_iter = 1
# seqs starts out as the very same object as the iteration-0 frame.
seqs = df
%run -i 08_aligned_fit_DO_ITER.py
# Carry the seq1/seq2 columns of df over into seqs before the next pass
# (presumably the script rebinds df with updated sequences -- TODO confirm).
seqs['seq1'] = df['seq1']
seqs['seq2'] = df['seq2']
N_iter += 1
%run -i 08_aligned_fit_DO_ITER.py
seqs['seq1'] = df['seq1']
seqs['seq2'] = df['seq2']
N_iter += 1
%run -i 08_aligned_fit_DO_ITER.py
seqs['seq1'] = df['seq1']
seqs['seq2'] = df['seq2']
N_iter += 1
%run -i 08_aligned_fit_DO_ITER.py
seqs['seq1'] = df['seq1']
seqs['seq2'] = df['seq2']
N_iter += 1
%run -i 08_aligned_fit_DO_ITER.py
# Write the accumulated tables; {fit_type} is filled in from the current
# variable of that name via locals().
fit_metrics_all.to_csv('models/{fit_type}.metrics.csv'.format(**locals()) )
model_features_all.to_csv('models/{fit_type}.features.csv'.format(**locals()) )
# Bare expression: shows the run label as the cell output in a notebook.
fit_type